ISLR packages provide several datasets. The description of datasets are available here: http://cran.r-project.org/web/packages/ISLR/ISLR.pdf
rm(list = ls()) # clear the workspace -- NOTE(review): discouraged in scripts; prefer restarting the R session instead
library(ISLR) # load ISLR data package (provides the Default dataset used below)
library(tidyverse) # attaches ggplot2, dplyr, tidyr, readr, purrr, tibble, stringr, forcats
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2) # redundant: ggplot2 is already attached by library(tidyverse) above (harmless)
Examine the dataset. It contains four variables:
default, student, balance, and
income.
# Store the ISLR Default data as a tibble for nicer printing and stricter subsetting.
Default <- Default %>% as_tibble()
Default          # print the tibble
glimpse(Default) # quick column-wise overview of types and values
## Rows: 10,000
## Columns: 4
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No, No, No~
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, No, No, N~
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919.5885, 8~
## $ income <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496, 7491.55~
head(Default) # show the first six rows
tail(Default) # show the last six rows
names(Default) # variable names
## [1] "default" "student" "balance" "income"
nrow(Default) # the number of rows
## [1] 10000
ncol(Default) # the number of columns
## [1] 4
summary(Default) # basic summary statistics of the variables in Default dataset (default, student, balance, income)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
# frequency table
summary(Default$default) # summary of default variable
## No Yes
## 9667 333
table(Default$default) # contingency table: frequency of each case (yes/no) in default variable
##
## No Yes
## 9667 333
table(Default$student) # contingency table: frequency of each case (yes/no) in student variable
##
## No Yes
## 7056 2944
table(Default$default, Default$student) # cross-tabulation
##
## No Yes
## No 6850 2817
## Yes 206 127
# Bar chart: class frequencies of the outcome variable (default: No/Yes).
ggplot(Default, aes(x = default, fill = default)) +
  geom_bar()

# Bar chart: class frequencies of the student indicator.
ggplot(Default, aes(x = student, fill = student)) +
  geom_bar()

# Histogram of income (bin width = $1,000).
ggplot(Default, aes(x = income)) +
  geom_histogram(binwidth = 1000, colour = "black", fill = "white")

# Overlaid income histograms by student status (alpha keeps the overlap visible).
ggplot(Default, aes(x = income, fill = student)) +
  geom_histogram(binwidth = 1000, alpha = .5, position = "identity")

# Overlaid income histograms by default status.
ggplot(Default, aes(x = income, fill = default)) +
  geom_histogram(binwidth = 1000, alpha = .5, position = "identity")

# Boxplot: balance distribution by default status.
ggplot(Default, aes(x = default, y = balance, fill = default)) +
  geom_boxplot()

# Boxplot: income distribution by default status.
ggplot(Default, aes(x = default, y = income, fill = default)) +
  geom_boxplot()

# Scatterplot: balance vs. income, colored by default status (shape 1 = open circles).
ggplot(Default, aes(x = balance, y = income, color = default)) +
  geom_point(shape = 1)

We need rpart package for Classification Trees. More on rpart : http://cran.r-project.org/web/packages/rpart/vignettes/longintro.pdf
rpart.plot is a package for visualizing classification
tree models.
library(rpart)
library(rpart.plot)
Let’s build a model.
ct_model<-rpart(default~student+balance+income, # model formula: predict default from the other three variables
data=Default, # dataset
method="class", # "class" indicates a classification tree model
control=rpart.control(cp=0.03,maxdepth=4)) # tree control parameters: complexity penalty (cp) and maximum depth
Next, let’s visualize the tree model.
rpart.plot(ct_model) # tree plot

Here, I tried to visualize the classification tree model results on the scatterplot. It could be done, because the model used the two numeric variables.
Default %>%
ggplot(aes(x=balance,y=income,color=default)) +
geom_point(shape=1)+
geom_vline(xintercept=1800.002,linetype="dashed")+
geom_vline(xintercept=1971.915,linetype="dashed")+
geom_hline(yintercept=27401.2,linetype="dashed")+
annotate("rect",xmin=1800.002, xmax=1971.915, ymin=0, ymax=27401.2,fill="red",alpha=0.2)+
annotate("rect",xmin=1971.915, xmax=Inf, ymin=0, ymax=Inf,fill="blue",alpha=0.2)+
annotate("rect",xmin=0, xmax=1800.002, ymin=0, ymax=Inf,fill="red",alpha=0.2)+
annotate("rect",xmin=1800.002, xmax=1971.915, ymin=27401.2, ymax=Inf,fill="blue",alpha=0.2)

#print(ct_model) # model results
#summary(ct_model) # model result details
Get the predicted value - class membership (yes or no) –> using a cut-off of 50%.
ct_pred_class<-predict(ct_model,type="class") # class membership (yes or no)
head(ct_pred_class)
## 1 2 3 4 5 6
## No No No No No No
## Levels: No Yes
ct_pred<-predict(ct_model) # get the predicted values - class probabilities (default)
head(ct_pred)
## No Yes
## 1 0.9823929 0.01760708
## 2 0.9823929 0.01760708
## 3 0.9823929 0.01760708
## 4 0.9823929 0.01760708
## 5 0.9823929 0.01760708
## 6 0.9823929 0.01760708
Let’s create a new column in Default: save the predicted probability of default (yes) from the second column of ct_pred.
Default$ct_pred_prob<-ct_pred[,2]
Alternatively, you can specify a certain cut-off value to assign class membership. You can set the cut-off at 30%, 50%, 80%, or whatever you want.
Default$ct_pred_class<-ifelse(Default$ct_pred_prob>0.5,"Yes","No")
head(Default) # first six rows, now including the predicted probability and class columns
Default[253,] # get the information of the 253rd customer
# show the customers whose predicted probability of default is greater than 70%
Default%>%
filter(ct_pred_prob>0.7)
# sort customers by predicted probability of default in descending order
Default%>%
arrange(desc(ct_pred_prob))
set.seed(1)
#install.packages("randomForest")
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
rf_model<-randomForest(default~income+balance+student, # model formula
data=Default,ntree=500, cutoff=c(0.5,0.5))
#print(rf_model)
head(rf_model$votes) # indicates the % of trees that voted for each class
## No Yes
## 1 1.0000000 0.000000000
## 2 1.0000000 0.000000000
## 3 0.9946809 0.005319149
## 4 1.0000000 0.000000000
## 5 1.0000000 0.000000000
## 6 1.0000000 0.000000000
head(rf_model$predicted) # the class favored by more trees (i.e. majority vote wins)
## 1 2 3 4 5 6
## No No No No No No
## Levels: No Yes
varImpPlot(rf_model) # importance of variables
# Out-of-bag voting proportions per observation.
# FIX: the component is named "votes"; the earlier rf_model$vote only worked
# through `$` partial matching, which is fragile -- use the full name.
head(rf_model$votes)
## No Yes
## 1 1.0000000 0.000000000
## 2 1.0000000 0.000000000
## 3 0.9946809 0.005319149
## 4 1.0000000 0.000000000
## 5 1.0000000 0.000000000
## 6 1.0000000 0.000000000
Default$rf_vote<-predict(rf_model,type="prob")[,2] # save the proportion of trees voting "Yes" for each customer
head(Default)
library(e1071)
model_svm<-svm(formula= default ~ balance+income+student, # model formula
data=Default, # dataset
kernel="linear", # this is the form of the decision boundary. Let's start with a linear kernel.
cost=0.1) # there are parameters that are used to tune the model; cost penalizes margin violations
model_svm
##
## Call:
## svm(formula = default ~ balance + income + student, data = Default,
## kernel = "linear", cost = 0.1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
##
## Number of Support Vectors: 672
dv<-data.frame(model_svm$decision.values)
ggplot(dv,aes(x=No.Yes)) +
geom_histogram(colour="black",fill="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
head(model_svm$fitted)
## 1 2 3 4 5 6
## No No No No No No
## Levels: No Yes
table(model_svm$fitted)
##
## No Yes
## 10000 0
predicted_svm<-predict(model_svm,Default,decision.values = TRUE)
head(attr(predicted_svm, "decision.values"))
## No/Yes
## 1 1.0000632
## 2 1.0004151
## 3 0.9999982
## 4 1.0001708
## 5 1.0000694
## 6 1.0003995
Default
Default$svm_pred_class <- predict(model_svm, Default)
Default$svm_dv<-c(attr(predicted_svm, "decision.values"))
Default
logit_model<-glm(default~student+balance+income, # generalized linear models
family="binomial", # specifying error distribution
data=Default) # dataset
summary(logit_model)
##
## Call:
## glm(formula = default ~ student + balance + income, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4691 -0.1418 -0.0557 -0.0203 3.7383
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.087e+01 4.923e-01 -22.080 < 2e-16 ***
## studentYes -6.468e-01 2.363e-01 -2.738 0.00619 **
## balance 5.737e-03 2.319e-04 24.738 < 2e-16 ***
## income 3.033e-06 8.203e-06 0.370 0.71152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1571.5 on 9996 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 8
Default$log_odd<-predict(logit_model) # get predicted log odds (default)
Default$logit_pred_prob<-predict(logit_model,type="response") # get predicted probabilities
glimpse(Default)
## Rows: 10,000
## Columns: 11
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No~
## $ student <fct> No, Yes, No, No, No, Yes, No, Yes, No, No, Yes, Yes, N~
## $ balance <dbl> 729.5265, 817.1804, 1073.5492, 529.2506, 785.6559, 919~
## $ income <dbl> 44361.625, 12106.135, 31767.139, 35704.494, 38463.496,~
## $ ct_pred_prob <dbl> 0.01760708, 0.01760708, 0.01760708, 0.01760708, 0.0176~
## $ ct_pred_class <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", ~
## $ rf_vote <dbl> 0.000000000, 0.000000000, 0.005319149, 0.000000000, 0.~
## $ svm_pred_class <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, No~
## $ svm_dv <dbl> 1.0000632, 1.0004151, 0.9999982, 1.0001708, 1.0000694,~
## $ log_odd <dbl> -6.549544, -6.791338, -4.614261, -7.724689, -6.245449,~
## $ logit_pred_prob <dbl> 1.428724e-03, 1.122204e-03, 9.812272e-03, 4.415893e-04~
Default%>%
select("default","student","log_odd","logit_pred_prob")
With the predicted probabilities, you can sort customers by the predicted probability of default in descending order.
Default%>%
arrange(desc(logit_pred_prob))%>%
select(default, student, balance, income, ct_pred_prob, ct_pred_class,logit_pred_prob)
And use a different cut-off for class prediction using
ifelse().
Default$logit_pred_class<-ifelse(Default$logit_pred_prob>0.5,"Yes","No")
Default
.
Let’s try to find the best set of parameters through model validation.
Let’s select 2,000 indices out of 10,000 (20% of dataset). Using these indices, we will create a test and a training dataset.
set.seed(1) # set a random seed
index <- sample(10000, 2000) # random selection of indices.
Check index. It contains 2,000 random numbers.
index
## [1] 1017 8004 4775 9725 8462 4050 8789 1301 8522 1799 9941 8229 1129 8921
## [15] 878 8677 7845 5922 6526 5071 2900 7075 4650 9638 2159 3476 1948 2580
## [29] 1530 7289 4633 6519 4344 1222 2858 5400 9888 526 1069 5522 8214 5838
## [43] 5862 6337 5491 9320 7494 9175 1791 9831 4939 465 7976 9392 3863 9326
## [57] 8276 1895 3101 3990 6877 6764 1328 8749 8479 6473 6995 9714 858 6763
## [71] 9182 316 733 6273 4907 5051 2330 9359 6429 3992 1706 501 6655 536
## [85] 6881 7507 7514 3747 5163 29 1942 4934 1820 2281 9509 1966 8561 4182
## [99] 5603 5742 8547 5939 6956 9265 8553 9532 9458 6902 2866 8943 219 8327
## [113] 532 3123 9104 6611 2178 5936 7233 5932 4455 2153 1148 1101 1242 1218
## [127] 4115 8465 418 6563 867 4499 3821 9803 5897 818 4730 664 719 500
## [141] 3045 6816 6373 8615 8159 8613 9181 3598 3700 5136 9346 8696 785 6789
## [155] 7131 8053 1572 4401 1833 2461 4225 309 7892 4078 7146 8633 3189 470
## [169] 1360 1822 1790 8541 8066 3144 9086 4686 8666 9855 5048 455 6500 7793
## [183] 3306 6217 8207 4158 4486 9251 5987 9988 5447 6809 8174 4096 6887 4084
## [197] 2110 1172 797 9966 9788 4072 5057 7452 5388 2762 8802 1265 7812 9903
## [211] 2265 2156 6415 5390 9891 7151 5458 5729 1760 5704 5685 4664 4075 5474
## [225] 7525 4212 3070 5353 7439 9655 9413 462 6443 4049 6657 9690 9765 7888
## [239] 2033 731 7039 2813 3805 4771 966 9383 6193 5591 5793 8483 4749 8439
## [253] 2482 6885 3388 3418 2722 2884 56 7193 934 7640 8109 4590 7883 3552
## [267] 5123 6803 4257 127 717 9644 7715 7447 676 9969 5795 148 121 8361
## [281] 8790 1491 2815 1115 6308 3616 3551 6525 436 2137 8137 533 9573 2752
## [295] 5592 9038 5191 6459 8411 9050 3565 570 6976 1926 5666 1813 7681 2833
## [309] 7885 5694 7981 5161 7560 8473 4063 7918 9864 724 1354 8684 6164 3255
## [323] 9779 5199 8706 6017 9445 2016 132 6186 5094 1659 6068 6438 9613 2312
## [337] 6321 5040 1839 3505 1838 5259 8512 4211 6595 9434 6990 6303 2167 2543
## [351] 5914 8301 3760 6663 2587 8945 9926 7935 393 8840 7046 7923 4208 5147
## [365] 4318 6410 4254 9746 2778 9673 7343 6103 7334 9331 4436 6236 5772 99
## [379] 7684 7983 3200 3822 8090 6989 5763 7805 1190 1459 5117 1776 116 7360
## [393] 9618 3157 1749 1326 1847 9424 1018 9858 383 8920 771 5207 484 4599
## [407] 6449 3887 2563 58 4801 2021 9736 2170 1803 1124 313 6790 822 6625
## [421] 5480 5131 3382 9607 81 3328 3126 2194 5235 5663 2796 6472 2775 7174
## [435] 1401 8627 5605 7961 3360 8426 1779 7656 4278 8253 4606 1089 8841 1861
## [449] 9890 7374 7893 9568 2992 3797 6079 1177 8209 1057 3706 6440 7123 2257
## [463] 1525 6420 5809 2079 9847 8987 7009 8765 3037 6485 3558 4568 7842 2351
## [477] 7666 3651 6266 1871 5034 5005 8060 363 8816 2573 9459 9647 3329 2373
## [491] 702 1378 959 2273 9874 8589 5031 9791 744 564 9796 2290 3482 5327
## [505] 5287 87 4313 5633 860 5620 7817 6134 2446 628 5368 1104 1493 4062
## [519] 3926 4784 2811 1473 2572 4364 9545 8863 4933 2569 9410 4745 2551 128
## [533] 7211 1435 102 7255 4066 3457 453 4585 6138 9321 8146 6050 815 8766
## [547] 644 7238 8240 8151 3437 7120 9366 2608 4358 1590 7368 3352 9027 2899
## [561] 9734 1821 9531 7917 8793 1759 8910 2107 1019 2020 8735 7757 9592 296
## [575] 5144 5853 764 5566 8314 3047 8690 1043 376 4562 6101 919 9876 1487
## [589] 6109 5999 5661 7025 854 8717 9954 108 9965 3862 3018 2285 8588 144
## [603] 3733 5254 9739 8444 2069 4963 133 9792 386 1802 8482 8962 5756 7500
## [617] 9094 2158 4297 487 7379 3254 1421 908 3363 247 8054 8649 9183 6320
## [631] 8775 647 2732 492 3807 3586 1620 7348 3129 5435 553 1864 1443 5238
## [645] 7445 7200 1031 4330 2531 3417 9447 1587 9737 1651 3402 6818 5776 5195
## [659] 7512 1630 7998 8864 5527 2136 9477 5148 8343 1524 8978 3946 827 4876
## [673] 2616 2684 8825 6167 8968 7419 1131 4333 1890 5334 4404 2997 7637 6398
## [687] 9846 8378 4407 2019 2548 5750 1967 4279 9667 8539 4490 6557 8781 5305
## [701] 6583 2161 5630 4210 8692 1216 2395 8423 129 2424 3145 2792 314 4324
## [715] 8881 8101 823 8164 407 5024 9741 280 3579 517 3793 3590 8861 851
## [729] 9914 3465 9367 6314 7652 793 703 324 8427 3641 3390 2106 787 8068
## [743] 1373 1592 660 5006 1038 6574 9405 64 2131 9112 626 8498 6850 5607
## [757] 5502 4848 2506 6293 8097 8567 5920 284 6940 1863 1950 8094 8608 8168
## [771] 3210 4759 4973 4023 9777 9935 7173 7555 9924 9491 4206 3729 6849 4400
## [785] 5512 7891 384 6776 5330 9884 5846 5943 9243 8644 573 1611 5165 8345
## [799] 1700 3781 479 1922 7914 8544 8389 4083 3298 9904 5863 4186 4064 5671
## [813] 2664 8971 8827 249 9740 7239 101 7965 3665 520 5670 8313 9339 3415
## [827] 7435 650 617 4676 1846 4223 3680 7568 1428 7300 4105 4944 5361 3338
## [841] 5864 9101 9228 5337 4754 8778 812 2480 5956 338 7809 5845 6512 8398
## [855] 8634 4777 8147 2267 6919 8782 9594 7310 1372 6063 6081 9009 2823 220
## [869] 7095 6558 5413 6584 4572 2214 4637 8744 3663 6581 8714 611 7671 217
## [883] 9290 8654 4819 6239 7309 3631 6515 6531 7826 271 1299 9808 2871 6799
## [897] 7821 4171 7304 5957 4893 4733 27 8753 6448 4578 4385 9167 2937 4292
## [911] 3766 7228 7261 9839 8443 5061 9080 3115 2223 5912 9398 1598 8405 5575
## [925] 5934 8693 1088 4721 2761 9051 3572 1071 6348 9816 503 2541 5466 2439
## [939] 5857 2864 979 3152 9210 2284 610 9162 8682 7574 6226 2116 656 9998
## [953] 4327 3702 6693 5135 5176 7486 1252 2826 9325 2391 2678 9319 413 8395
## [967] 8671 6660 7785 7987 8331 1259 5189 7740 4901 4103 9677 4895 7799 1654
## [981] 427 9276 8131 582 8768 8045 1817 225 6247 5578 8377 2841 2426 4609
## [995] 5770 8437 3679 3842 2928 450 6008 665 1670 8806 1090 5281 5253 3715
## [1009] 4836 6357 4589 4596 2403 8231 7008 1552 721 5761 9156 7545 8578 5473
## [1023] 4896 6883 5767 6417 9630 327 6750 1205 3237 1768 4038 6903 5049 2296
## [1037] 6834 9565 7658 3316 6160 838 7808 2490 4474 4703 5396 4649 4630 3977
## [1051] 556 4346 306 945 2772 4826 1279 8032 8941 4286 8416 9053 7040 9702
## [1065] 4904 8818 2765 2240 4588 534 2423 6265 9660 1526 2389 5749 401 6987
## [1079] 4780 3929 34 6387 3728 9610 6093 2751 7986 347 3984 1302 8763 4866
## [1093] 3580 5564 7895 5727 9384 5418 68 4618 9284 5184 8365 391 2350 505
## [1107] 4475 3983 9845 3553 935 1316 5240 7565 8926 3936 5157 2408 90 2600
## [1121] 2499 1689 5101 1954 3387 4142 1511 773 3375 2145 40 41 5291 4239
## [1135] 113 3890 6744 2255 3158 6262 8325 7004 5735 1641 4527 9621 7663 5394
## [1149] 5470 4612 9695 3159 6196 2705 6480 1830 6963 1581 5958 5007 2904 4778
## [1163] 667 3435 4811 4611 5260 1778 2889 5971 9606 9340 7758 2002 4298 9127
## [1177] 8117 1792 7298 7598 6896 759 1560 6211 6251 1258 9774 4107 6238 5948
## [1191] 2617 3920 699 8532 3792 3426 9942 7182 3814 6384 5650 2235 3993 1376
## [1205] 9790 3845 7871 3956 4953 1098 5794 9357 2934 2538 4724 1000 5728 6335
## [1219] 6035 9863 9834 7945 619 5192 1144 2182 1636 2282 2185 7613 4258 3531
## [1233] 6461 6762 9656 6592 5307 9875 3561 2129 8501 1965 1742 310 5814 7662
## [1247] 616 1092 4086 6060 2211 4119 8942 8739 9414 756 7721 1818 1782 1198
## [1261] 7993 6968 8096 5993 6264 8432 1619 7571 4773 6958 3309 5468 1929 6128
## [1275] 649 2602 8598 965 5265 54 4634 3257 8916 9002 5245 6586 2595 3266
## [1289] 830 9345 569 441 1983 7 9448 229 5316 2930 9171 8243 9688 5581
## [1303] 2301 8705 1362 6284 2409 4110 1951 2124 6430 672 2766 2379 1996 6488
## [1317] 8656 1961 1998 8075 335 7263 5788 9421 9799 1106 4150 3818 5836 2744
## [1331] 3051 44 7628 6623 6275 1786 1935 9141 2380 7436 6571 5257 6791 3804
## [1345] 5797 3235 5318 9198 7995 3327 5138 9254 8524 1936 1949 2840 8857 3879
## [1359] 7936 5686 2356 8977 8451 7523 406 5765 5186 4804 3324 697 3915 3447
## [1373] 2924 2076 2837 1154 1748 9200 692 4451 6970 4894 1035 7218 3661 5093
## [1387] 4315 7680 3731 1457 5003 3058 2044 2789 7621 3442 7421 9657 8730 7974
## [1401] 8413 3245 7471 5226 9248 498 3143 1221 2962 9262 9753 3100 3318 9920
## [1415] 3229 5807 1806 7216 3039 4713 5268 5949 9640 2003 9225 4632 8587 4807
## [1429] 6537 1213 3765 1855 7109 6207 5562 9513 5843 9095 3625 9772 6616 1610
## [1443] 5889 463 7903 2788 8866 1603 8517 263 3768 1386 9873 9220 2382 4419
## [1457] 9099 8689 8133 3434 7696 9264 36 990 9947 9281 6058 1984 8322 5653
## [1471] 8724 1828 5841 8407 6514 2645 2378 3212 9299 6313 5089 1718 8275 9238
## [1485] 7303 5411 7400 1229 9149 6383 761 4507 7837 3414 4736 2419 1186 5963
## [1499] 2206 7679 1692 168 5940 8425 9012 8014 9523 1732 615 5533 7114 1368
## [1513] 4418 2571 967 3381 6696 881 8976 5158 8531 7738 8762 7464 2336 9597
## [1527] 1781 7503 4860 9268 259 8653 5438 5227 5941 2567 9824 4965 396 7375
## [1541] 5239 1389 6344 6522 6432 3750 7739 8890 8518 6851 2410 6305 7410 7697
## [1555] 3873 6061 4888 1920 3698 9971 6589 6237 8333 9184 9322 7933 8548 6939
## [1569] 6003 5479 1600 2436 2437 3119 4742 2919 7451 2432 5677 9350 8119 1274
## [1583] 3840 4654 7033 2109 7814 2258 4660 2486 2652 149 7117 7267 3204 9266
## [1597] 3970 7381 6311 1907 185 7332 9219 5651 1937 1116 5014 9206 9973 1731
## [1611] 7101 2686 4689 7566 5406 1370 3852 977 7276 8139 4176 38 7912 5621
## [1625] 7154 6298 6444 2921 4433 170 7116 6052 334 1656 1424 5617 6394 8655
## [1639] 7880 3031 2261 6684 9120 3052 2429 3489 9146 2025 9807 7462 9960 1677
## [1653] 4506 3290 5560 2008 9293 9007 9070 1831 9335 6649 1568 7245 2148 8516
## [1667] 1697 4125 8586 2053 2238 3446 6889 8878 6112 9090 6487 8507 7425 8612
## [1681] 741 2985 6386 4471 746 7000 9375 2058 3431 9297 6878 4178 3726 5350
## [1695] 8640 7015 7413 2370 5310 9089 9232 5196 7150 476 8858 1865 6983 2401
## [1709] 8853 889 6257 9294 1693 4723 226 8514 3184 8015 9380 8337 97 4852
## [1723] 7894 448 4240 842 1579 1078 9123 4482 5632 5100 9044 1065 7386 6969
## [1737] 8550 2784 7647 2164 1297 7387 8093 7415 2530 8984 2674 3922 524 4758
## [1751] 7763 6156 3154 4037 7176 5030 1801 4166 3127 8767 9716 7710 2445 5082
## [1765] 1026 3543 3165 1577 3575 8672 1845 46 5917 8219 726 2725 2892 1023
## [1779] 8710 3954 5819 4477 7230 7449 2844 283 457 1275 333 4692 7124 7752
## [1793] 6362 392 4288 1726 1158 728 5830 1853 5045 7626 2102 2377 2638 3876
## [1807] 4007 9632 7119 3203 2213 4547 2797 3669 7539 1076 1261 7776 2790 1488
## [1821] 5931 7736 6745 1953 3439 2397 6954 6869 3034 456 3503 5764 4430 3219
## [1835] 251 6505 1022 3104 7184 5893 9191 668 2367 5262 796 8048 6900 2637
## [1849] 5530 1495 4799 5970 6719 8986 8940 3913 8347 2374 1130 9363 7085 3347
## [1863] 4045 7256 5609 4058 1349 2472 6882 682 6926 9669 5676 6242 2335 3530
## [1877] 241 5456 6587 3384 871 5918 1164 9418 4360 2708 4111 2673 7473 6681
## [1891] 1663 5540 9580 4615 5249 4863 4917 7140 874 2323 6839 826 7266 2359
## [1905] 4428 3075 9163 63 1245 4003 6573 6997 2504 4146 4487 2619 1974 8171
## [1919] 6307 740 326 5628 3752 6051 925 80 5044 4051 778 5321 7544 4625
## [1933] 9151 7534 2413 1133 4458 2967 112 6208 5433 2644 4222 7306 214 2880
## [1947] 5272 2568 9031 7083 4899 3522 3193 8596 2842 1193 9143 1874 4485 931
## [1961] 3596 4740 2781 4753 4669 8435 8788 9701 7265 1567 952 6646 3690 4550
## [1975] 8088 5667 5058 8124 9169 3017 4290 8200 2857 5367 9406 2094 3025 369
## [1989] 5319 9399 6091 1678 9776 4303 1517 637 6310 7034 6539 1391
The following codes will select the data instances in the rows in
index and save them as test. The rest will be
saved as training.
test <- Default[index,c("default","student","balance","income")] # save 20% (the 2,000 sampled rows) as a test dataset
#test1 <- Default[index, ] # alternative: keep all columns, including earlier predictions
training <-Default[-index,c("default","student","balance","income")] # save the remaining 8,000 rows as a training set
test
training
Alternatively,
test<-Default%>%
filter(row_number() %in% index)%>%
select("default","student","balance","income")
training<-Default%>%
select("default","student","balance","income")%>%
setdiff(test)
Build a model using the training set. This part is the same as before
except we use training instead of the entire dataset.
training_model<-rpart(default~student+balance+income,
data=training,
method="class",
control=rpart.control(cp=0.03))
rpart.plot(training_model)
This model does not look much different from what we had. But we now have 20% of dataset that were not used when we built this model.
Now, the model will be evaluated on a test data. For this, we apply
the model to the test dataset and get the predicted values. It can be
done by providing a dataset name with a model to predict()
function.
test$ct_pred_prob<-predict(training_model,test)[,2]
head(test)
test$ct_pred_class<-predict(training_model,test,type="class")
test
Check the accuracy of the model by comparing the actual values (default) and the predicted values (ct_pred_class).
table(test$default==test$ct_pred_class) # TRUE = correctly classified
##
## FALSE TRUE
## 63 1937
# Accuracy = correct predictions / total test cases.
# FIX: the numerator was mistyped as 1947; the table above shows 1937 correct.
1937/2000
## [1] 0.9685
sum(predict(training_model, test, type="class")==test$default)/nrow(test) # test-set accuracy, computed directly
## [1] 0.9685
sum(predict(training_model, training, type="class")==training$default)/nrow(training) # training-set accuracy
## [1] 0.975
###### holdout validation (Back to slide)********************
You may also want to check a confusion table.
table(test$ct_pred_class,test$default, dnn=c("predicted","actual")) # confusion table on test data
## actual
## predicted No Yes
## No 1916 49
## Yes 14 21
set.seed(1) # set a random seed
full_tree<-rpart(default~student+balance+income,
data=training,
method="class",
control=rpart.control(cp=0)) #complexity increases as cp value ~ zero
rpart.plot(full_tree)
xerror is cross-validated relative error rates, and
xstd is its standard deviation.
printcp(full_tree) # xerror, xstd - cross validation results
##
## Classification tree:
## rpart(formula = default ~ student + balance + income, data = training,
## method = "class", control = rpart.control(cp = 0))
##
## Variables actually used in tree construction:
## [1] balance income
##
## Root node error: 263/8000 = 0.032875
##
## n= 8000
##
## CP nsplit rel error xerror xstd
## 1 0.1026616 0 1.00000 1.00000 0.060641
## 2 0.0342205 2 0.79468 0.84030 0.055739
## 3 0.0114068 3 0.76046 0.80608 0.054624
## 4 0.0038023 5 0.73764 0.82129 0.055122
## 5 0.0012674 8 0.72624 0.84791 0.055983
## 6 0.0000000 14 0.71863 0.93156 0.058597
Using plotcp(), you can check how the cross-validation
error rate changes as the complexity of the model increases. In this
chart, x-axis is model complexity, and y-axis is xerror rate (from
cross-validation). The bars indicate standard deviation.
plotcp(full_tree)
We may choose the cp value that minimizes cross-validation errors. However, it may not be always the best option. As you can see, the error rate with 4 splits is within the range of standard deviation of the minimum error rate. You may want to choose the one with 4 splits for the ease of interpretation.
# Pick the row of the CP table with the smallest cross-validated error (xerror).
min_xerror<-full_tree$cptable[which.min(full_tree$cptable[,"xerror"]),]
min_xerror
## CP nsplit rel error xerror xstd
## 0.01140684 3.00000000 0.76045627 0.80608365 0.05462358
# prune tree with minimum cp value (first element of min_xerror is the CP value)
min_xerror_tree<-prune(full_tree, cp=min_xerror[1])
rpart.plot(min_xerror_tree) # visualize the pruned tree
Let’s consider min_xerror_tree as the best pruned tree, and get the prediction.
bp_tree<-min_xerror_tree # treat the pruned tree as the best-pruned (bp) model
test$ct_bp_pred_prob<-predict(bp_tree,test)[,2] # predicted probability of default on the test set
test$ct_bp_pred_class=ifelse(test$ct_bp_pred_prob>0.5,"Yes","No") # class prediction at a 50% cut-off
head(test)
table(test$ct_bp_pred_class!=test$default) # TRUE = misclassified (error count)
##
## FALSE TRUE
## 1937 63
# Error rate = misclassified / total.
# FIX: was mistyped as 53/2000; the table above shows 63 errors (matches 0.0315 below).
63/2000
## [1] 0.0315
# Accuracy = correctly classified / total.
# FIX: was mistyped as 1947/2000; the table above shows 1937 correct.
1937/2000
## [1] 0.9685
sum(predict(bp_tree, test, type="class")!=test$default)/nrow(test) # same error rate, computed directly
## [1] 0.0315
table(test$ct_bp_pred_class,test$default, dnn=c("predicted","actual")) # confusion table on test data
## actual
## predicted No Yes
## No 1916 49
## Yes 14 21
Following a similar process, we can validate the performance of a random forest model.
set.seed(1)
rf_training_model<-randomForest(default~income+balance+student, # model formula
data=training, # use a training dataset for building a model
ntree=500,
cutoff=c(0.5,0.5),
mtry=2, # number of variables considered in each split (useful when lots of variables)
importance=TRUE)
rf_training_model
##
## Call:
## randomForest(formula = default ~ income + balance + student, data = training, ntree = 500, cutoff = c(0.5, 0.5), mtry = 2, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 2.96%
## Confusion matrix:
## No Yes class.error
## No 7671 66 0.008530438
## Yes 171 92 0.650190114
sum(predict(rf_training_model,test,type="class")!=test$default)/nrow(test)
## [1] 0.033
The tuneRF() function has an argument, ntreeTry that defaults to 50 trees. Set nTreeTry = 500 to train a random forest model of the same size as you previously did. After tuning the forest, this function will plot model performance (OOB error) as a function of the mtry values that were evaluated.
# Execute the tuning process
set.seed(1)
res <- tuneRF(x = training%>%select(-default),
y = training$default,mtryStart=2,
ntreeTry = 500)
## mtry = 2 OOB error = 2.97%
## Searching left ...
## mtry = 1 OOB error = 2.75%
## 0.07563025 0.05
## Searching right ...
## mtry = 3 OOB error = 3.12%
## -0.1363636 0.05
rf_best_model<-randomForest(default~income+balance+student, # model formula
data=training, # use a training dataset for building a model
ntree=500,
cutoff=c(0.5,0.5),
mtry=1,
importance=TRUE)
rf_best_model
##
## Call:
## randomForest(formula = default ~ income + balance + student, data = training, ntree = 500, cutoff = c(0.5, 0.5), mtry = 1, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 2.7%
## Confusion matrix:
## No Yes class.error
## No 7711 26 0.003360476
## Yes 190 73 0.722433460
test$rf_pred_prob<-predict(rf_best_model,test,type="prob")[,2] #use a test dataset for model evaluation
test$rf_pred_class<-predict(rf_best_model,test,type="class")
glimpse(test)
## Rows: 2,000
## Columns: 10
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ student <fct> No, No, No, No, Yes, No, No, No, Yes, No, Yes, Yes, N~
## $ balance <dbl> 825.5133, 642.9997, 615.7043, 913.5872, 1499.7247, 35~
## $ income <dbl> 24905.23, 41473.51, 39376.39, 46907.23, 13190.65, 350~
## $ ct_pred_prob <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_pred_class <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ ct_bp_pred_prob <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_bp_pred_class <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",~
## $ rf_pred_prob <dbl> 0.000, 0.000, 0.000, 0.000, 0.004, 0.000, 0.002, 0.00~
## $ rf_pred_class <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
table(test$default==test$rf_pred_class) # TRUE = correctly classified by the random forest
##
## FALSE TRUE
## 61 1939
# Error rate = misclassified / total.
# FIX: was mistyped as 56/2000; the table above shows 61 errors.
61/2000
## [1] 0.0305
# Accuracy = correctly classified / total.
# FIX: was mistyped as 1944/2000; the table above shows 1939 correct.
1939/2000
## [1] 0.9695
We can tune SVM models using tune function. Set a range
of search values for the parameter. It builds an SVM model for each
possible combination of parameter values and evaluate accuracy. It will
return the parameter combination that yields the best accuracy.
svm_tune <- tune(svm, # find a best set of parameters for the svm model
default~student+balance+income,
data = training,
kernel="radial",
ranges = list(cost = 10^(-5:0))) # specifying the ranges of parameters
# in the penalty function to be examined
# you may wish to increase the search space
print(svm_tune) # best parameters for the model
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.02825
best_svm_mod <- svm_tune$best.model # best model is saved in best.model
hist(best_svm_mod$decision.values)
#sum(best_svm_mod$decision.values < 0)
table(best_svm_mod$fitted)
##
## No Yes
## 7937 63
table(training$default==best_svm_mod$fitted) # TRUE = correctly classified on the training set
##
## FALSE TRUE
## 222 7778
# Training error rate = misclassified / total.
# FIX: was mistyped as 224/8000; the table above shows 222 errors.
222/8000
## [1] 0.02775
test$svm_pred_class <- predict(best_svm_mod, test) # save the predicted class by the svm model
test$svm_dv<-as.numeric(attr(predict(best_svm_mod, test, decision.values = TRUE),"decision.values"))
glimpse(test)
## Rows: 2,000
## Columns: 12
## $ default <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ student <fct> No, No, No, No, Yes, No, No, No, Yes, No, Yes, Yes, N~
## $ balance <dbl> 825.5133, 642.9997, 615.7043, 913.5872, 1499.7247, 35~
## $ income <dbl> 24905.23, 41473.51, 39376.39, 46907.23, 13190.65, 350~
## $ ct_pred_prob <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_pred_class <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ ct_bp_pred_prob <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_bp_pred_class <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",~
## $ rf_pred_prob <dbl> 0.000, 0.000, 0.000, 0.000, 0.004, 0.000, 0.002, 0.00~
## $ rf_pred_class <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ svm_pred_class <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ svm_dv <dbl> 1.134960, 1.167183, 1.195762, 1.012859, 1.105100, 1.2~
table(test$default==test$svm_pred_class) # TRUE = correctly classified by the tuned SVM on the test set
##
## FALSE TRUE
## 60 1940
# Test error rate = misclassified / total.
# FIX: was mistyped as 56/2000; the table above shows 60 errors.
60/2000
## [1] 0.03
logit_training_model<-glm(default~student+balance+income,family="binomial",data=training)
summary(logit_training_model)
##
## Call:
## glm(formula = default ~ student + balance + income, family = "binomial",
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1617 -0.1412 -0.0545 -0.0195 3.7829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.072e+01 5.622e-01 -19.072 <2e-16 ***
## studentYes -6.461e-01 2.657e-01 -2.432 0.015 *
## balance 5.784e-03 2.647e-04 21.854 <2e-16 ***
## income -4.445e-06 9.487e-06 -0.469 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2313.6 on 7999 degrees of freedom
## Residual deviance: 1247.6 on 7996 degrees of freedom
## AIC: 1255.6
##
## Number of Fisher Scoring iterations: 8
# Evaluate the full logistic regression model (trained on `training`) on the hold-out test set.
test$logit_pred_prob<-predict(logit_training_model, test, type="response") #get predicted probability of default = "Yes" for each test row (type="response" returns probabilities, not log-odds)
test$logit_pred_class<-ifelse(test$logit_pred_prob>0.5,"Yes","No") #classify using a 0.5 probability cutoff
glimpse(test) # test set now carries predictions from every model fitted so far
## Rows: 2,000
## Columns: 14
## $ default          <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ student          <fct> No, No, No, No, Yes, No, No, No, Yes, No, Yes, Yes, N~
## $ balance          <dbl> 825.5133, 642.9997, 615.7043, 913.5872, 1499.7247, 35~
## $ income           <dbl> 24905.23, 41473.51, 39376.39, 46907.23, 13190.65, 350~
## $ ct_pred_prob     <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_pred_class    <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ ct_bp_pred_prob  <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_bp_pred_class <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",~
## $ rf_pred_prob     <dbl> 0.000, 0.000, 0.000, 0.000, 0.004, 0.000, 0.002, 0.00~
## $ rf_pred_class    <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ svm_pred_class   <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ svm_dv           <dbl> 1.134960, 1.167183, 1.195762, 1.012859, 1.105100, 1.2~
## $ logit_pred_prob  <dbl> 2.332228e-03, 7.550376e-04, 6.508659e-04, 3.515907e-0~
## $ logit_pred_class <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",~
table(test$default==test$logit_pred_class) # TRUE = correctly classified, FALSE = misclassified
##
## FALSE  TRUE
##    57  1943
57/2000 # test-set misclassification (error) rate: 57 wrong out of 2,000
## [1] 0.0285
#sum(test$logit_pred_class!=test$default)/nrow(test) # equivalent programmatic way to compute the same error rate
To improve logistic regression performance, it’s important to understand which variables should be included. Step-wise regression helps us select variables: it assesses model performance by adding (or removing) variables one at a time and comparing the resulting fits.
# Specify a null model with no predictors (intercept only) — the lower bound of the search
null_model <- glm(default~1, data = training, family = "binomial")
# Specify the full model using all of the potential predictors — the upper bound of the search
full_model <- glm(default~student+balance+income, data = training, family = "binomial")
# Use a forward stepwise algorithm to build a parsimonious model:
# starting from the null model, step() adds the predictor that most improves AIC
# at each step, and stops when no addition lowers AIC further.
forward_model <- step(null_model, scope = list(lower = null_model, upper = full_model), direction = "forward")
## Start:  AIC=2315.57
## default ~ 1
##
##           Df Deviance    AIC
## + balance  1   1259.5 1263.5
## + student  1   2292.9 2296.9
## + income   1   2299.7 2303.7
## <none>         2313.6 2315.6
##
## Step:  AIC=1263.54
## default ~ balance
##
##           Df Deviance    AIC
## + student  1   1247.8 1253.8
## + income   1   1253.5 1259.5
## <none>         1259.5 1263.5
##
## Step:  AIC=1253.84
## default ~ balance + student
##
##          Df Deviance    AIC
## <none>        1247.8 1253.8
## + income  1   1247.6 1255.6
# Forward selection kept balance and student; adding income would raise AIC, so it was excluded.
summary(forward_model)
##
## Call:
## glm(formula = default ~ balance + student, family = "binomial",
##     data = training)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.1738  -0.1413  -0.0546  -0.0195   3.7753
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.090e+01  4.220e-01 -25.826  < 2e-16 ***
## balance      5.784e-03  2.647e-04  21.850  < 2e-16 ***
## studentYes  -5.474e-01  1.630e-01  -3.359 0.000781 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 2313.6  on 7999  degrees of freedom
## Residual deviance: 1247.8  on 7997  degrees of freedom
## AIC: 1253.8
##
## Number of Fisher Scoring iterations: 8
# Use a backward stepwise algorithm to build a parsimonious model:
# starting from the full model, step() drops the predictor whose removal most
# improves AIC at each step, and stops when no removal lowers AIC further.
backward_model <- step(full_model, scope = list(lower = null_model, upper = full_model), direction = "backward")
## Start:  AIC=1255.62
## default ~ student + balance + income
##
##           Df Deviance    AIC
## - income   1   1247.8 1253.8
## <none>         1247.6 1255.6
## - student  1   1253.5 1259.5
## - balance  1   2292.8 2298.8
##
## Step:  AIC=1253.84
## default ~ student + balance
##
##           Df Deviance    AIC
## <none>         1247.8 1253.8
## - student  1   1259.5 1263.5
## - balance  1   2292.9 2296.9
# Backward elimination dropped income and arrived at the same model as forward selection.
summary(backward_model)
##
## Call:
## glm(formula = default ~ student + balance, family = "binomial",
##     data = training)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.1738  -0.1413  -0.0546  -0.0195   3.7753
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.090e+01  4.220e-01 -25.826  < 2e-16 ***
## studentYes  -5.474e-01  1.630e-01  -3.359 0.000781 ***
## balance      5.784e-03  2.647e-04  21.850  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 2313.6  on 7999  degrees of freedom
## Residual deviance: 1247.8  on 7997  degrees of freedom
## AIC: 1253.8
##
## Number of Fisher Scoring iterations: 8
# Refit the model selected by both stepwise searches (student + balance) and evaluate it on the test set.
logit_best_model<-glm(default~student+balance,family="binomial",data=training)
summary(logit_best_model)
##
## Call:
## glm(formula = default ~ student + balance, family = "binomial",
##     data = training)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.1738  -0.1413  -0.0546  -0.0195   3.7753
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.090e+01  4.220e-01 -25.826  < 2e-16 ***
## studentYes  -5.474e-01  1.630e-01  -3.359 0.000781 ***
## balance      5.784e-03  2.647e-04  21.850  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 2313.6  on 7999  degrees of freedom
## Residual deviance: 1247.8  on 7997  degrees of freedom
## AIC: 1253.8
##
## Number of Fisher Scoring iterations: 8
test$logit_pred_prob<-predict(logit_best_model,test,type="response") # overwrite earlier predictions with those from the selected model
test$logit_pred_class<-ifelse(test$logit_pred_prob>0.5,"Yes","No") # same 0.5 cutoff as before
glimpse(test)
## Rows: 2,000
## Columns: 14
## $ default          <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ student          <fct> No, No, No, No, Yes, No, No, No, Yes, No, Yes, Yes, N~
## $ balance          <dbl> 825.5133, 642.9997, 615.7043, 913.5872, 1499.7247, 35~
## $ income           <dbl> 24905.23, 41473.51, 39376.39, 46907.23, 13190.65, 350~
## $ ct_pred_prob     <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_pred_class    <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ ct_bp_pred_prob  <dbl> 0.01737452, 0.01737452, 0.01737452, 0.01737452, 0.017~
## $ ct_bp_pred_class <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",~
## $ rf_pred_prob     <dbl> 0.000, 0.000, 0.000, 0.000, 0.004, 0.000, 0.002, 0.00~
## $ rf_pred_class    <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ svm_pred_class   <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N~
## $ svm_dv           <dbl> 1.134960, 1.167183, 1.195762, 1.012859, 1.105100, 1.2~
## $ logit_pred_prob  <dbl> 2.184175e-03, 7.611338e-04, 6.500509e-04, 3.629822e-0~
## $ logit_pred_class <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",~
table(test$default==test$logit_pred_class) # TRUE = correctly classified, FALSE = misclassified
##
## FALSE  TRUE
##    57  1943
57/2000 # test error rate is unchanged (0.0285) — dropping income did not hurt accuracy at the 0.5 cutoff
## [1] 0.0285
# Compare the models across all classification thresholds with ROC curves (pROC package).
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
##     cov, smooth, var
ct_roc<-roc(test$default,test$ct_pred_prob,auc=TRUE) #two arguments for roc function: true value, numeric result from model; auc=TRUE also computes the area under the curve
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
rf_roc<-roc(test$default,test$rf_pred_prob,auc=TRUE) # random forest: ROC from predicted probabilities
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
logit_roc<-roc(test$default,test$logit_pred_prob,auc=TRUE) # logistic regression: ROC from predicted probabilities
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
svm_roc<-roc(test$default,test$svm_dv,auc=TRUE) # SVM: ROC from the decision values (svm_dv), not probabilities
## Setting levels: control = No, case = Yes
## Setting direction: controls > cases
# NOTE(review): pROC auto-detected the reversed direction (controls > cases) for the SVM —
# presumably smaller decision values indicate the "Yes" class here; verify against the SVM fit.
plot(ct_roc,print.auc=TRUE,col="blue") # classification tree curve starts the plot
plot(rf_roc,print.auc=TRUE,print.auc.y=.4,col="green", add=TRUE) # add=TRUE overlays on the same axes; print.auc.y staggers the AUC labels
plot(logit_roc,print.auc=TRUE,print.auc.y=.3, col="red",add=TRUE)
plot(svm_roc,print.auc=TRUE,print.auc.y=.2, col="black",add=TRUE)